from bs4 import BeautifulSoup
import urllib.request
import os
from selenium import webdriver
import pdfkit
class HTML_parser(object):
    """Collect 10-Q filing links for one ticker from secfilings.nasdaq.com.

    The company's filing listing is downloaded to a scratch file
    (``test.txt``, relative to the current working directory), parsed with
    BeautifulSoup, and reduced to a list of absolute 10-Q filing URLs.
    """

    # Host that serves the listing page; filing hrefs are relative to it.
    _BASE_URL = "http://secfilings.nasdaq.com"
    # Scratch file the listing page is downloaded to before it is parsed.
    _SCRATCH_FILE = 'test.txt'
    # Only every third 10-Q link is kept. Per the original author's note this
    # is idiosyncratic to the nasdaq source: repeats may not always come in
    # groups of three.
    _LINKS_PER_FILING = 3

    def __init__(self, tickers):
        """Remember the ticker symbol used when ``find_links`` gets none."""
        self.tickers = tickers

    def find_links(self, tickers=None):
        """Return the absolute URLs of the 10-Q filings listed for a ticker.

        Args:
            tickers: A single ticker symbol. Defaults to the value passed to
                the constructor.

        Returns:
            A list of absolute filing URLs. The list is empty when the
            listing page could not be downloaded or contains no 10-Q links.
        """
        if tickers is None:
            tickers = self.tickers

        try:
            urllib.request.urlretrieve(
                self._listing_url(tickers), self._SCRATCH_FILE
            )
        except OSError:
            # URLError / HTTPError / socket errors all derive from OSError.
            # There is nothing to parse after a failed download, so stop here.
            print(str(tickers) + " is not a valid stock ticker")
            return []

        # Read as bytes so BeautifulSoup detects the page encoding itself,
        # and make sure the scratch file handle is always closed.
        with open(self._SCRATCH_FILE, 'rb') as listing:
            soup = BeautifulSoup(listing, 'html.parser')

        anchors = soup.find_all(target="fetchFiling")
        # .get() tolerates anchors that carry the target but no href.
        return self._filing_urls([a.get('href', '') for a in anchors])

    @classmethod
    def _listing_url(cls, ticker):
        """Build the listing-page URL for *ticker*, percent-escaping it."""
        from urllib.parse import quote

        return (
            cls._BASE_URL
            + "/filingsCompany.asp?SchParam=Ticker&SchValue="
            + quote(str(ticker), safe="")
            + "&StartRow=1&EndRow=10000"
        )

    @classmethod
    def _filing_urls(cls, hrefs):
        """Keep the 10-Q hrefs, drop the repeats and make them absolute."""
        quarterly = [href for href in hrefs if '10-Q' in href]
        return [
            cls._BASE_URL + str(href)
            for href in quarterly[::cls._LINKS_PER_FILING]
        ]
class pdf_writer(object):
    """Save every 10-Q filing found for a list of tickers as a PDF file."""

    def __init__(self, tick_list):
        """Remember the ticker list used when ``write_pdf`` gets none."""
        self.tick_list = tick_list

    def write_pdf(self, tick_list=None):
        """Write one PDF per 10-Q filing into the current working directory.

        For each ticker the filing links are looked up with ``HTML_parser``;
        each link is opened in a PhantomJS browser, frame 1 is selected and
        the browser's current URL is handed to ``pdfkit``. Files are named
        ``<ticker>_<received date>_10_Q.pdf``.

        Args:
            tick_list: Iterable of ticker symbols. Defaults to the list
                passed to the constructor.
        """
        if tick_list is None:
            tick_list = self.tick_list

        out_dir = os.getcwd()
        for ticker in tick_list:
            links = HTML_parser(ticker).find_links(ticker)
            if not links:
                # Nothing to render: do not start a browser for this ticker.
                continue

            driver = webdriver.PhantomJS()
            try:
                for position, link in enumerate(links):
                    target = self._pdf_path(out_dir, ticker, link, position)
                    driver.get(link)
                    # NOTE(review): frame 1 is presumably the frame that holds
                    # the filing document itself - confirm against the site.
                    driver.switch_to.frame(1)
                    pdfkit.from_url(driver.current_url, target)
            finally:
                # Always shut the browser process down, even after an error.
                driver.quit()

    @staticmethod
    def _filing_date(link):
        """Return the ``RcvdDate`` value of *link* with '/' replaced by '_'.

        The value runs from ``RcvdDate=`` up to ``&CoName`` (or to the end of
        the link when ``&CoName`` is absent). Returns ``""`` when the link
        has no ``RcvdDate=`` parameter.
        """
        _, marker, tail = str(link).partition("RcvdDate=")
        if not marker:
            return ""
        return tail.partition("&CoName")[0].replace("/", "_")

    @classmethod
    def _pdf_path(cls, out_dir, ticker, link, position):
        """Build the output path of the PDF for the filing at *link*."""
        # Fall back to the link's position so undated filings stay distinct.
        date = cls._filing_date(link) or "filing%d" % position
        return os.path.join(out_dir, str(ticker) + "_" + date + "_10_Q.pdf")
# The tickers come from user input; pdf_writer looks up each ticker's filing
# links through HTML_parser and renders them to PDF files.
input_tickers = input("For which tickers do you want the earnings reports for? Separate with a comma \n")
# Drop blank entries (empty input, doubled or trailing commas) so that no
# request is made for an empty ticker symbol.
tickers = [x.strip() for x in input_tickers.split(',') if x.strip()]
pdf_writer(tickers).write_pdf(tickers)